import os
from llama_index.readers.file import PDFReader

# Batch-convert every PDF found directly inside `pdf_folder` into a UTF-8
# text file with the same base name inside `output_folder`.

# Input and output folders (relative to the current working directory).
pdf_folder = "./pdfs"
output_folder = "./txts"
os.makedirs(output_folder, exist_ok=True)

# Collect all PDF files. The extension check is case-insensitive so that
# names such as "REPORT.PDF" are not silently skipped, and the list is sorted
# because os.listdir() returns entries in arbitrary order.
# NOTE: on a case-sensitive filesystem "a.pdf" and "a.PDF" both map to
# "a.txt"; the one processed last wins.
pdf_files = sorted(
    f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf")
)

pdf_reader = PDFReader()

# Process each PDF in turn.
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)

    # A directory whose name happens to end in ".pdf" is not a document.
    if not os.path.isfile(pdf_path):
        continue

    documents = pdf_reader.load_data(pdf_path)

    # Build the matching TXT path by swapping only the trailing extension.
    # str.replace(".pdf", ".txt") would also rewrite any ".pdf" occurring
    # earlier in the name (e.g. "a.pdf.v2.pdf" -> "a.txt.v2.txt").
    base_name, _ = os.path.splitext(pdf_file)
    txt_path = os.path.join(output_folder, base_name + ".txt")

    # Write the extracted text. The reader returns a sequence of documents
    # (presumably one per page by default -- confirm against the installed
    # llama_index version); join them with a newline so that the last word of
    # one document does not run into the first word of the next.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(doc.text for doc in documents))

    print(f"Saved: {txt_path}")